boxplots#
This page contains instructions and documentation for creating plots used to visualize curve ensembles.
spaghetti_plot#
Plots a random selection of curves.
Parameters#
client (bigquery.Client): BigQuery client object.
table_name (str): BigQuery table name containing data in ‘dataset.table’ form.
reference_table (str): BigQuery table name containing reference table in ‘dataset.table’ form.
geo_level (str): The name of a column from the reference table. The geographical level used to determine what places are included.
geo_values (str or listlike or None): The geographies to be included. A value or subset of values from the geo_level column. If None, then all values will be included.
geo_column (str, optional): Name of column in original table containing geography identifier. Defaults to ‘basin_id’.
reference_column (str, optional): Name of the column in the reference table corresponding to the geography identifiers in geo_column. Defaults to ‘basin_id’.
value (str, optional): Name of column in the original table containing the importation value to be analyzed. Defaults to ‘value’.
n (int, optional): Number of curves to plot. Defaults to 25.
Returns#
fig (plotly.graph_objects.Figure): Plotly Figure containing visualization.
Example#
import epidemic_intelligence as ei
from google.oauth2 import service_account
from google.cloud import bigquery
credentials = service_account.Credentials.from_service_account_file('../../../credentials.json') # use the path to your credentials
project = 'net-data-viz-handbook' # use your project name
# Initialize a GC client
client = bigquery.Client(credentials=credentials, project=project)
table_name = 'h1n1_R2.basins_prevalence_agg'
reference_table = 'reference.gleam-geo-map'
reference_column = 'basin_id' # name of a column in reference table
geo_column = 'basin_id' # name of a column in table corresponding to column in reference table
geo_level = 'basin_label'
geo_values = 'Portland(US-ME)'
value = 'Infectious_18_23'
sp_fig = ei.spaghetti_plot(
client=client,
table_name=table_name,
reference_table=reference_table,
geo_level=geo_level,
geo_values=geo_values,
geo_column=geo_column,
reference_column=reference_column,
value=value,
n=100)
# finishing touches
sp_fig.update_layout(width=900, height=500,
showlegend=True,
font_family='PT Sans Narrow',
title='Spaghetti Plot',)
sp_fig.show()
C:\Users\elija\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\google\cloud\bigquery\table.py:1727: UserWarning: BigQuery Storage module not found, fetch data with the REST endpoint instead.
warnings.warn(
functional_boxplot#
A functional boxplot uses curve-based statistics that treat entire curves as a single data point, as opposed to each observation in a curve. Always plots the median and interquartile range.
Parameters#
client (bigquery.Client): BigQuery client object.
table_name (str): BigQuery table name containing data in ‘dataset.table’ form.
reference_table (str): BigQuery table name containing reference table in ‘dataset.table’ form.
geo_level (str): The name of a column from the reference table. The geographical level used to determine what places are included.
geo_values (str or listlike or None): The geographies to be included. A value or subset of values from the geo_level column. If None, then all values will be included.
geo_column (str, optional): Name of column in original table containing geography identifier. Defaults to ‘basin_id’.
reference_column (str, optional): Name of the column in the reference table corresponding to the geography identifiers in geo_column. Defaults to ‘basin_id’.
value (str, optional): Name of column in the original table containing the importation value to be analyzed. Defaults to ‘value’.
num_clusters (int, optional): Number of clusters that curves will be broken into based on grouping_method. Defaults to 1. Note: raising num_clusters above one significantly increases runtime.
num_features (int, optional): Number of features the kmeans algorithm will use to group curves if num_clusters is greater than 1. Must be less than or equal to the number of run_ids in the table.
grouping_method (str, optional): Method used to group curves. Must be one of:
'mse' (default): Fixed-time pairwise mean squared error between curves. 'abc': Fixed-time pairwise area between curves. Also called mean absolute error.
kmeans_table (str, optional): BigQuery table name containing clustering information in ‘dataset.table’ form. Used when kmeans has already been performed with delete_data=False. Allows function to skip costly kmeans algorithm.
centrality_method (str, optional): Method used to determine curve centrality within their group. Must be one of:
'mse' (default): Summed fixed-time mean squared error between curves. 'abc': Summed fixed-time pairwise area between curves. Also called mean absolute error. 'mbd': Modified band depth. For more information, see Sun and Genton (2011).
threshold (float, optional): Number of interquartile ranges from the median a curve must be within to not be considered an outlier. Defaults to 1.5.
dataset (str or None, optional): Name of BigQuery dataset to store intermediate tables. If None, then random hash value will be used. Defaults to None.
delete_data (bool, optional): If True, then intermediate data tables will be deleted after the figure is built. Defaults to False.
overwrite (bool, optional): If True, then will not prompt for confirmation if overwriting an existing BigQuery dataset. Defaults to False.
Returns#
fig (plotly.graph_objects.Figure): Plotly Figure containing visualization.
Example#
# required
table_name = 'h1n1_R2.basins_prevalence_agg'
reference_table = 'reference.gleam-geo-map'
reference_column = 'basin_id' # name of a column in reference table
geo_column = 'basin_id' # name of a column in table corresponding to column in reference table
geo_level = 'basin_label'
geo_values = 'Portland(US-ME)'
value = 'Infectious_18_23'
# Set parameters for grouping
num_clusters = 1
num_features = 20
grouping_method = 'mse' # mean squared error
centrality_method = 'mse' # mean squared error
dataset = None
delete_data = True
fbp_fig = ei.functional_boxplot(
client=client,
table_name=table_name,
reference_table=reference_table,
geo_level=geo_level,
geo_values=geo_values,
geo_column=geo_column,
reference_column=reference_column,
value=value,
num_clusters=num_clusters,
num_features=num_features,
grouping_method=grouping_method,
centrality_method=centrality_method,
dataset=dataset,
delete_data=delete_data,
overwrite=True
)
# finishing touches
fbp_fig.update_layout(width=900, height=500,
showlegend=True,
font_family='PT Sans Narrow',
title='Functional Boxplot',
yaxis_title="Infectious 18-23yo"
)
fbp_fig.show()
Dataset `net-data-viz-handbook.32e14b71bb95e1a4dc22b07241399cf7a417e7340351eaa9672256ee6b3e350f` created.
C:\Users\elija\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\google\cloud\bigquery\table.py:1727: UserWarning:
BigQuery Storage module not found, fetch data with the REST endpoint instead.
BigQuery dataset `net-data-viz-handbook.32e14b71bb95e1a4dc22b07241399cf7a417e7340351eaa9672256ee6b3e350f` removed successfully, or it did not exist.
fixed_time_boxplot#
A fixed-time boxplot uses fixed-time statistics that rank each point at each time step, and use those to construct confidence intervals for each time step. Always plots the median and interquartile range.
Parameters#
client (bigquery.Client): BigQuery client object.
table_name (str): BigQuery table name containing data in ‘dataset.table’ form.
reference_table (str): BigQuery table name containing reference table in ‘dataset.table’ form.
geo_level (str): The name of a column from the reference table. The geographical level used to determine what places are included.
geo_values (str or listlike or None): The geographies to be included. A value or subset of values from the geo_level column. If None, then all values will be included.
geo_column (str, optional): Name of column in original table containing geography identifier. Defaults to ‘basin_id’.
reference_column (str, optional): Name of the column in the reference table corresponding to the geography identifiers in geo_column. Defaults to ‘basin_id’.
value (str, optional): Name of column in the original table containing the importation value to be analyzed. Defaults to ‘value’.
num_clusters (int, optional): Number of clusters that curves will be broken into based on grouping_method. Defaults to 1. Note: raising num_clusters above one significantly increases runtime.
num_features (int, optional): Number of features the kmeans algorithm will use to group curves if num_clusters is greater than 1. Must be less than or equal to the number of run_ids in the table.
grouping_method (str, optional): Method used to group curves. Must be one of:
'mse' (default): Fixed-time pairwise mean squared error between curves. 'abc': Fixed-time pairwise area between curves. Also called mean absolute error.
kmeans_table (str, optional): BigQuery table name containing clustering information in ‘dataset.table’ form. Used when kmeans has already been performed with delete_data=False. Allows function to skip costly kmeans algorithm.
dataset (str or None, optional): Name of BigQuery dataset to store intermediate tables. If None, then random hash value will be used. Defaults to None.
delete_data (bool, optional): If True, then intermediate data tables will be deleted after the figure is built. Defaults to False.
overwrite (bool, optional): If True, then will not prompt for confirmation if overwriting an existing BigQuery dataset. Defaults to False.
confidence (float, optional): From 0 to 1. Confidence level of interval that will be graphed. Also determines which points are considered outliers.
full_range (bool, optional): If True, then mesh will be drawn around entire envelope, including outliers. Defaults to False.
outlying_points (bool, optional): If True, then outlying points will be graphed. Defaults to True.
Returns#
fig (plotly.graph_objects.Figure): Plotly Figure containing visualization.
Example#
# required
table_name = 'h1n1_R2.basins_prevalence_agg'
reference_table = 'reference.gleam-geo-map'
reference_column = 'basin_id' # name of a column in reference table
geo_column = 'basin_id' # name of a column in table corresponding to column in reference table
geo_level = 'basin_label'
geo_values = 'Portland(US-ME)'
value = 'Infectious_18_23'
# Set parameters for grouping
num_clusters = 1
num_features = 20
grouping_method = 'mse' # mean squared error
confidence = .95
dataset = None
delete_data = True
ft_fig = ei.fixed_time_boxplot(
client,
table_name,
reference_table,
geo_level,
geo_values,
geo_column=geo_column,
reference_column=reference_column,
num_clusters=num_clusters,
num_features=num_features,
grouping_method=grouping_method,
value=value,
dataset=dataset,
delete_data=delete_data,
kmeans_table=False,
confidence=confidence,
full_range=True,
outlying_points=False,
)
# finishing touches
ft_fig.update_layout(width=900, height=500,
showlegend=True,
font_family='PT Sans Narrow',
title='Traditional Boxplot',)
ft_fig.update_layout(showlegend=True)
Dataset `net-data-viz-handbook.078fee40a39ef9a8912e762a6e6dc113d11d3fb7f2be58ac319e13c9b90f900a` created.
BigQuery dataset `net-data-viz-handbook.078fee40a39ef9a8912e762a6e6dc113d11d3fb7f2be58ac319e13c9b90f900a` removed successfully, or it did not exist.
fetch_fixed_time_quantiles#
Allows calculation of custom fixed-time quantiles. Always fetches median.
Parameters#
client (bigquery.Client): BigQuery client object.
table_name (str): BigQuery table name containing data in ‘dataset.table’ form.
reference_table (str): BigQuery table name containing reference table in ‘dataset.table’ form.
confidences (list of float): List of confidences to gather, from 0 to 1. For example, entering .5 will result in the 25th and 75th percentiles being calculated.
geo_level (str): The name of a column from the reference table. The geographical level used to determine what places are included.
geo_values (str or listlike or None): The geographies to be included. A value or subset of values from the geo_level column. If None, then all values will be included.
geo_column (str, optional): Name of column in original table containing geography identifier. Defaults to ‘basin_id’.
reference_column (str, optional): Name of the column in the reference table corresponding to the geography identifiers in geo_column. Defaults to ‘basin_id’.
value (str, optional): Name of column in the original table containing the importation value to be analyzed. Defaults to ‘value’.
num_clusters (int, optional): Number of clusters that curves will be broken into based on grouping_method. Defaults to 1. Note: raising num_clusters above one significantly increases runtime.
num_features (int, optional): Number of features the kmeans algorithm will use to group curves if num_clusters is greater than 1. Must be less than or equal to the number of run_ids in the table.
grouping_method (str, optional): Method used to group curves. Must be one of:
'mse' (default): Fixed-time pairwise mean squared error between curves. 'abc': Fixed-time pairwise area between curves. Also called mean absolute error.
kmeans_table (str, optional): BigQuery table name containing clustering information in ‘dataset.table’ form. Used when kmeans has already been performed with delete_data=False. Allows function to skip costly kmeans algorithm.
dataset (str or None, optional): Name of BigQuery dataset to store intermediate tables. If None, then random hash value will be used. Defaults to None.
delete_data (bool, optional): If True, then intermediate data tables will be deleted after the dataframe is returned. Defaults to False.
overwrite (bool, optional): If True, then will not prompt for confirmation if overwriting an existing BigQuery dataset. Defaults to False.
Returns#
df (pandas.DataFrame): pandas dataframe containing quantiles and median.
Example#
# uses the same parameters as fixed_time_boxplot!
df_ft = ei.boxplots.fetch_fixed_time_quantiles(
client=client,
table_name=table_name,
reference_table=reference_table,
confidences=[.9, .5], # just introduce the confidences parameter
geo_level=geo_level,
geo_values=geo_values,
geo_column=geo_column,
reference_column=reference_column,
num_clusters=num_clusters,
num_features=num_features,
grouping_method=grouping_method,
value=value,
dataset=dataset,
delete_data=delete_data,
kmeans_table=False,
)
df_ft
Dataset `net-data-viz-handbook.ab7c684e00be1a0a79bf20da8828ed72fe66bf1ce58fb99fd4ba1a06a0ade3aa` created.
---------------------------------------------------------------------------
BadRequest Traceback (most recent call last)
Cell In[21], line 2
1 # uses the same parameters as fixed_time_boxplot!
----> 2 df_ft = ei.boxplots.fetch_fixed_time_quantiles(
3 client=client,
4 table_name=table_name,
5 reference_table=reference_table,
6 confidences=[.9, .5], # just introduce the confidences parameter
7 geo_level=geo_level,
8 geo_values=geo_values,
9 geo_column=geo_column,
10 reference_column=reference_column,
11 num_clusters=num_clusters,
12 num_features=num_features,
13 grouping_method=grouping_method,
14 value=value,
15 dataset=dataset,
16 delete_data=delete_data,
17 kmeans_table=False,
18 )
20 df_ft
File ~\Documents\24f-coop\demovenv\Lib\site-packages\epidemic_intelligence\boxplots.py:967, in fetch_fixed_time_quantiles(client, table_name, reference_table, geo_level, geo_values, confidences, geo_column, reference_column, num_clusters, num_features, grouping_method, value, dataset, delete_data, kmeans_table, overwrite)
946 # print(', '.join(clause for clause in conf_clause))
948 fixed_time_quantiles = f'''
949 WITH centroid_data AS (
950 SELECT
(...)
964 ORDER BY CENTROID_ID, date;
965 '''
--> 967 df = client.query(fixed_time_quantiles).result().to_dataframe() # Execute the query to create the table
969 if delete_data:
970 client.delete_dataset(
971 dataset,
972 delete_contents=True, # Set to False if you only want to delete an empty dataset
973 not_found_ok=True # If True, no error is raised if the dataset does not exist
974 )
File ~\Documents\24f-coop\demovenv\Lib\site-packages\google\cloud\bigquery\job\query.py:1681, in QueryJob.result(self, page_size, max_results, retry, timeout, start_index, job_retry)
1676 remaining_timeout = None
1678 if remaining_timeout is None:
1679 # Since is_job_done() calls jobs.getQueryResults, which is a
1680 # long-running API, don't delay the next request at all.
-> 1681 while not is_job_done():
1682 pass
1683 else:
1684 # Use a monotonic clock since we don't actually care about
1685 # daylight savings or similar, just the elapsed time.
File ~\Documents\24f-coop\demovenv\Lib\site-packages\google\api_core\retry\retry_unary.py:293, in Retry.__call__.<locals>.retry_wrapped_func(*args, **kwargs)
289 target = functools.partial(func, *args, **kwargs)
290 sleep_generator = exponential_sleep_generator(
291 self._initial, self._maximum, multiplier=self._multiplier
292 )
--> 293 return retry_target(
294 target,
295 self._predicate,
296 sleep_generator,
297 timeout=self._timeout,
298 on_error=on_error,
299 )
File ~\Documents\24f-coop\demovenv\Lib\site-packages\google\api_core\retry\retry_unary.py:153, in retry_target(target, predicate, sleep_generator, timeout, on_error, exception_factory, **kwargs)
149 # pylint: disable=broad-except
150 # This function explicitly must deal with broad exceptions.
151 except Exception as exc:
152 # defer to shared logic for handling errors
--> 153 _retry_error_helper(
154 exc,
155 deadline,
156 sleep,
157 error_list,
158 predicate,
159 on_error,
160 exception_factory,
161 timeout,
162 )
163 # if exception not raised, sleep before next attempt
164 time.sleep(sleep)
File ~\Documents\24f-coop\demovenv\Lib\site-packages\google\api_core\retry\retry_base.py:212, in _retry_error_helper(exc, deadline, next_sleep, error_list, predicate_fn, on_error_fn, exc_factory_fn, original_timeout)
206 if not predicate_fn(exc):
207 final_exc, source_exc = exc_factory_fn(
208 error_list,
209 RetryFailureReason.NON_RETRYABLE_ERROR,
210 original_timeout,
211 )
--> 212 raise final_exc from source_exc
213 if on_error_fn is not None:
214 on_error_fn(exc)
File ~\Documents\24f-coop\demovenv\Lib\site-packages\google\api_core\retry\retry_unary.py:144, in retry_target(target, predicate, sleep_generator, timeout, on_error, exception_factory, **kwargs)
142 for sleep in sleep_generator:
143 try:
--> 144 result = target()
145 if inspect.isawaitable(result):
146 warnings.warn(_ASYNC_RETRY_WARNING)
File ~\Documents\24f-coop\demovenv\Lib\site-packages\google\cloud\bigquery\job\query.py:1630, in QueryJob.result.<locals>.is_job_done()
1607 if job_failed_exception is not None:
1608 # Only try to restart the query job if the job failed for
1609 # a retriable reason. For example, don't restart the query
(...)
1627 # into an exception that can be processed by the
1628 # `job_retry` predicate.
1629 restart_query_job = True
-> 1630 raise job_failed_exception
1631 else:
1632 # Make sure that the _query_results are cached so we
1633 # can return a complete RowIterator.
(...)
1639 # making any extra API calls if the previous loop
1640 # iteration fetched the finished job.
1641 self._reload_query_results(
1642 retry=retry, **reload_query_results_kwargs
1643 )
BadRequest: 400 Unrecognized name: DISINCT at [11:16]; reason: invalidQuery, location: query, message: Unrecognized name: DISINCT at [11:16]
Location: US
Job ID: 53cab13b-8c0f-4960-affe-9c835ea20513
Note: this BadRequest comes from a typo (DISINCT instead of DISTINCT) in the SQL generated inside fetch_fixed_time_quantiles; it is a library bug, not an error in the example parameters.